A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
A
B
View(anscombe)
lm(y3~x3, data = anscombe)
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
## or ff[[2]] <- as.name(paste0("y", i))
## ff[[3]] <- as.name(paste0("x", i))
mods[[i]] <- lmi <- lm(ff, data = anscombe)
print(anova(lmi))
}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
# Aggregate function
#Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma = c(0, 0, 2, 0))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
xlim = c(3, 19), ylim = c(3, 13))
abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
x = c(TRUE, FALSE, FALSE)
typeof(x) #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
dim(y)
length(y)
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)
quote(x+y)
as.list(quote(x + y))
1e3L #create constant
1L
cat(1+2)
'+'(1, 2)
x <- options()
x$prompt
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
i <- matrix(1:4, 2, byrow = TRUE)
i
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
# https://cran.r-project.org/doc/manuals/R-intro.pdf
help.start()
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
ls()
rm(list=ls())
ls()
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
levels(statef)
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
help("<-")
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
seq(0,10, length=11) == seq(0,10, by=1)
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
table(x)
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
summary(df)
names(df);cat(rownames(df))
# Merge
merge(x, y) # INNER JOIN
merge(x, y, all.x = TRUE) # LEFT JOIN
merge(x, y, all.y = TRUE) # RIGHT JOIN
merge(x, y, all = TRUE) # OUTER JOIN
merge(a, b, by=c("y", "z")) # specify what to merge on
Error in fix.by(by.x, x) : 'by' must specify uniquely valid columns
# Regression
data(cars)
#View(cars)
# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)
#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
library(rpart.plot)
Loading required package: rpart
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
data=train,
method="class",
control=rpart.control(minsplit=2, cp=0))
Error in is.data.frame(data) : object 'train' not found
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
# Delete
h[["bar"]] <- NULL
m <- matrix( c(1,2,3,4), nrow=2 )
m
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
?solve
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
r
ftable(d) #easier reading
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping
by(1:20, gl(2,10,20), sum)
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
main="Accumulated return")
# Strings
print("Hello\n")
cat("Hello\n") #use cat
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
paste(1:3, "Hello World!", sep=":")
nchar("Hello World!")
s <- "Hello World"
substring(s, 4, 6)
s <- "foo-->bar-->baz"
strsplit(s, "-->")
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
grep("O", s)
grep("O", s, value=T)
regexpr("o", "Hello")
regexpr("o", c("Hello", "World!"))
s <- "foo bar baz"
gsub(" ", "", s) # Remove all the spaces
gsub(" +", " ", s) # Remove multiple spaces and replace them by single spaces
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
# Dates
as.Date("2005-05-15") #ISO 8601
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
methods(class="Date")
as.POSIXlt("2005-05-15 21:45:17")
as.POSIXct(Sys.Date())
# Reading from dataframes
# option 1
#d <- read.table("foo.txt")
#d$Date <- as.Date( as.character( d$Date ) )
# option 2
#read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
options(warn=1)
methods(plot)
# Import
# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv") # semicolon-separated file, with a
# # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf") # Fixed width fields
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try
# d <- read.table("foo.csv", header = TRUE, sep = ",",
# na.strings = c("#N/A!", "NA", "@NA"),
# quote = '"',
# comment.char = "")
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
# x <- scan("foo.txt", sep=",") # Gives a numeric vector
# n <- scan("foo.txt", sep=",", nlines=1)
# x <- matrix(x, nc=n)
# A vector of strings
#x <- scan("foo.txt", what=character(0))
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression
# fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
# fm6 <- update(fm05, . ~ . + x6)
# smf6 <- update(fm6, sqrt(.) ~ .)
# Spine (from help)
require(graphics)
op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
attributes(g2)$na.action
sum(g1)
sum(g1, na.rm = TRUE)
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
cor(m) # returns all NAs
x y z
x 1 NA NA
y NA 1 NA
z NA NA 1
cor(m, use = "complete.obs")
x y z
x 1.0000 0.34819 0.70957
y 0.3482 1.00000 0.04583
z 0.7096 0.04583 1.00000
cor(m, use = "pairwise.complete.obs")
x y z
x 1.0000 0.2498 0.7096
y 0.2498 1.0000 0.4534
z 0.7096 0.4534 1.0000
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)
Call:
lm(formula = y ~ x + z, data = m)
Residuals:
2 3 5 6 7
-0.632 1.711 -1.237 -0.447 0.605
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.316 3.399 0.98 0.43
x 0.289 0.408 0.71 0.55
z -0.632 1.396 -0.45 0.70
Residual standard error: 1.65 on 2 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.203, Adjusted R-squared: -0.594
F-statistic: 0.254 on 2 and 2 DF, p-value: 0.797
# Checking length of data used for regression
length(m$y)
[1] 7
length(lm$fitted)
[1] 5
# checking where missing data is
is.na(m) # only useful for small examples
x y z
[1,] FALSE FALSE TRUE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE FALSE
[4,] TRUE FALSE FALSE
[5,] FALSE FALSE FALSE
[6,] FALSE FALSE FALSE
[7,] FALSE FALSE FALSE
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable",
xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)

# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
# Mean and Random imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
[1] 1 2 3 NA 5 7 9
x2
[1] 1.0 2.0 3.0 4.5 5.0 7.0 9.0
# Random imputation - conserve mean and variance. Sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
[1] 1 2 3 NA 5 7 9
x3
[1] 1 2 3 5 5 7 9
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
save(mydf, file = "saveddf.RData")
unlink("saveddf.RData")
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
structure(list(x = c(-0.626453810742332, 0.183643324222082, -0.835628612410047,
1.59528080213779, 0.329507771815361, -0.820468384118015, 0.487429052428485,
0.738324705129217, 0.575781351653492, -0.305388387156356), y = c(1.51178116845085,
0.389843236411431, -0.621240580541804, -2.2146998871775, 1.12493091814311,
-0.0449336090152309, -0.0161902630989461, 0.943836210685299,
0.821221195098089, 0.593901321217509), z = c(0.918977371608218,
0.782136300731067, 0.0745649833651906, -1.98935169586337, 0.61982574789471,
-0.0561287395290008, -0.155795506705329, -1.47075238389927, -0.47815005510862,
0.417941560199702)), .Names = c("x", "y", "z"), row.names = c(NA,
-10L), class = "data.frame")
dput(mydf, "saveddf.txt")
mydf2 <- dget("saveddf.txt")
mydf2
mydf==mydf2
x y z
[1,] FALSE FALSE FALSE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE TRUE
[4,] FALSE TRUE FALSE
[5,] FALSE FALSE FALSE
[6,] FALSE FALSE FALSE
[7,] FALSE FALSE FALSE
[8,] FALSE FALSE FALSE
[9,] FALSE FALSE FALSE
[10,] TRUE FALSE FALSE
unlink("saveddf.text")
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
# Dataframe rearrangement
set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20),
d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
# manual order change
head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
# using order
order(mydf$e)
[1] 2 16 18 19 20 14 3 4 5 12 1 11 13 15 7 8 10 9 6 17
head(mydf[order(mydf$e), ])
# Subset
mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]
subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
# Splitting
split(mydf, mydf$a)
$`1`
$`2`
NA
split(mydf, list(mydf$a, mydf$b))
$`1.1`
$`2.1`
$`1.2`
$`2.2`
$`1.3`
$`2.3`
$`1.4`
$`2.4`
lapply(split(mydf, mydf$a), summary)
$`1`
a b c d e
Min. :1 Min. :1.00 Min. :-1.728 Min. :-1.590 Min. : 1.00
1st Qu.:1 1st Qu.:1.25 1st Qu.:-0.779 1st Qu.:-0.359 1st Qu.: 8.25
Median :1 Median :2.00 Median :-0.122 Median : 0.193 Median :13.00
Mean :1 Mean :2.30 Mean :-0.244 Mean : 0.299 Mean :12.10
3rd Qu.:1 3rd Qu.:3.00 3rd Qu.: 0.483 3rd Qu.: 0.568 3rd Qu.:16.75
Max. :1 Max. :4.00 Max. : 0.976 Max. : 2.668 Max. :19.00
$`2`
a b c d e
Min. :2 Min. :1.00 Min. :-1.166 Min. :-1.1304 Min. : 2.00
1st Qu.:2 1st Qu.:2.00 1st Qu.:-0.488 1st Qu.:-0.6338 1st Qu.: 4.25
Median :2 Median :3.00 Median :-0.343 Median : 0.2961 Median : 8.00
Mean :2 Mean :2.70 Mean :-0.268 Mean : 0.0793 Mean : 8.90
3rd Qu.:2 3rd Qu.:3.75 3rd Qu.: 0.108 3rd Qu.: 0.4137 3rd Qu.:12.75
Max. :2 Max. :4.00 Max. : 0.555 Max. : 1.8397 Max. :20.00
lapply(split(mydf, mydf$a), summary)
$`1`
a b c d e
Min. :1 Min. :1.00 Min. :-1.728 Min. :-1.590 Min. : 1.00
1st Qu.:1 1st Qu.:1.25 1st Qu.:-0.779 1st Qu.:-0.359 1st Qu.: 8.25
Median :1 Median :2.00 Median :-0.122 Median : 0.193 Median :13.00
Mean :1 Mean :2.30 Mean :-0.244 Mean : 0.299 Mean :12.10
3rd Qu.:1 3rd Qu.:3.00 3rd Qu.: 0.483 3rd Qu.: 0.568 3rd Qu.:16.75
Max. :1 Max. :4.00 Max. : 0.976 Max. : 2.668 Max. :19.00
$`2`
a b c d e
Min. :2 Min. :1.00 Min. :-1.166 Min. :-1.1304 Min. : 2.00
1st Qu.:2 1st Qu.:2.00 1st Qu.:-0.488 1st Qu.:-0.6338 1st Qu.: 4.25
Median :2 Median :3.00 Median :-0.343 Median : 0.2961 Median : 8.00
Mean :2 Mean :2.70 Mean :-0.268 Mean : 0.0793 Mean : 8.90
3rd Qu.:2 3rd Qu.:3.75 3rd Qu.: 0.108 3rd Qu.: 0.4137 3rd Qu.:12.75
Max. :2 Max. :4.00 Max. : 0.555 Max. : 1.8397 Max. :20.00
# sampling
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
[1] 18 1 12 15 6
mydf[s,]
# test set
mydf[-s, ]
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2)
s2
[1] 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
mydf[s2,]
mydf[-s2,]
library(car)
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
[1] 1 1 1 1 1 2 2 2 2 2 NA NA NA NA NA NA NA NA NA NA
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
e <- recode(h, "NA=99")
e
[1] 1 2 3 4 5 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
interaction(i$Var1, i$Var2) # creates all possible combinations
[1] 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
Levels: 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n,
1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
str(mydf)
'data.frame': 30 obs. of 6 variables:
$ x1: int 0 0 1 1 0 1 1 1 1 0 ...
$ x2: int 0 0 0 0 0 0 0 0 0 0 ...
$ x3: int 1 0 0 0 1 0 0 1 0 1 ...
$ x4: int 1 1 1 0 1 1 1 1 0 1 ...
$ x5: num 1 1 1 1 1 1 1 1 1 1 ...
$ x6: num NA 1 1 0 NA 1 1 0 0 1 ...
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
[1] -1 0 1 1 -1 1 1 0 1 -1 0 -1 1 0 1 -1 0 1 -1 0 1 -1 1 0 -1 0 -1 0 1
[30] 0
with(mydf, x1+x2-x3)
[1] -1 0 1 1 -1 1 1 0 1 -1 0 -1 1 0 1 -1 0 1 -1 0 1 -1 1 0 -1 0 -1 0 1
[30] 0
rowSums(mydf)
[1] NA 3 4 2 NA 4 4 4 2 4 3 3 3 2 NA 4 5 4 NA 5 NA 4 3 2 NA 3 3 NA 3
[30] NA
rowSums(mydf, na.rm = T)
[1] 3 3 4 2 3 4 4 4 2 4 3 3 3 2 3 4 5 4 2 5 2 4 3 2 3 3 3 2 3 2
data.frame(1:n, rowSums(mydf, na.rm = T))
rowMeans(mydf)
[1] NA 0.5000 0.6667 0.3333 NA 0.6667 0.6667 0.6667 0.3333 0.6667 0.5000 0.5000
[13] 0.5000 0.3333 NA 0.6667 0.8333 0.6667 NA 0.8333 NA 0.6667 0.5000 0.3333
[25] NA 0.5000 0.5000 NA 0.5000 NA
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2) rows & columns.
[1] NA 0.3000 0.2667 0.2667 NA 0.2667 0.2667 0.2667 0.2667 0.2667 0.3000 0.3000
[13] 0.3000 0.2667 NA 0.2667 0.1667 0.2667 NA 0.1667 NA 0.2667 0.3000 0.2667
[25] NA 0.3000 0.3000 NA 0.3000 NA
apply(mydf, 2, var)
x1 x2 x3 x4 x5 x6
0.2575 0.0000 0.2483 0.1437 0.0000 NA
sapply(mydf, var) # over list or vector
x1 x2 x3 x4 x5 x6
0.2575 0.0000 0.2483 0.1437 0.0000 NA
newvar
[1] 3 2 0 0 3 0 0 1 0 3 2 3 0 1 0 3 1 0 2 1 0 3 0 2 3 2 3 2 0 2
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3)
number of items to replace is not a multiple of replacement length
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
2.5% 97.5%
-1.671 1.797
quantile(a, seq(0, 1, by = 0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
-2.2147 -1.0527 -0.6139 -0.3753 -0.0767 0.1139 0.3771 0.5812 0.7713 1.1811 2.4016
summary(as.logical(rbinom(1000, 1, 0.5)))
Mode FALSE TRUE
logical 492 508
summary(factor(a))
-2.2146998871775 -1.98935169586337 -1.80495862889104 -1.52356680042976
1 1 1 1
-1.47075238389927 -1.37705955682861 -1.27659220845804 -1.2536334002391
1 1 1 1
-1.22461261489836 -1.12936309608079 -1.04413462631653 -0.934097631644252
1 1 1 1
-0.835628612410047 -0.820468384118015 -0.743273208882405 -0.709946430921815
1 1 1 1
-0.70749515696212 -0.68875569454952 -0.626453810742332 -0.621240580541804
1 1 1 1
-0.612026393250771 -0.589520946188072 -0.573265414236886 -0.568668732818502
1 1 1 1
-0.54252003099165 -0.47815005510862 -0.473400636439312 -0.443291873218433
1 1 1 1
-0.41499456329968 -0.394289953710349 -0.367221476466509 -0.305388387156356
1 1 1 1
-0.304183923634301 -0.253361680136508 -0.164523596253587 -0.155795506705329
1 1 1 1
-0.135178615123832 -0.135054603880824 -0.112346212150228 -0.102787727342996
1 1 1 1
-0.0593133967111857 -0.0561287395290008 -0.0538050405829051 -0.0449336090152309
1 1 1 1
-0.0392400027331692 -0.0161902630989461 0.00110535163162413 0.0280021587806661
1 1 1 1
0.0743413241516641 0.0745649833651906 0.153253338211898 0.183643324222082
1 1 1 1
0.188792299514343 0.267098790772231 0.291446235517463 0.329507771815361
1 1 1 1
0.332950371213518 0.341119691424425 0.36458196213683 0.370018809916288
1 1 1 1
0.387671611559369 0.389843236411431 0.398105880367068 0.417941560199702
1 1 1 1
0.475509528899663 0.487429052428485 0.556663198673657 0.558486425565304
1 1 1 1
0.569719627442413 0.575781351653492 0.593901321217509 0.593946187628422
1 1 1 1
0.610726353489055 0.61982574789471 0.689739362450777 0.696963375404737
1 1 1 1
0.700213649514998 0.738324705129217 0.763175748457544 0.768532924515416
1 1 1 1
0.782136300731067 0.821221195098089 0.881107726454215 0.918977371608218
1 1 1 1
0.943836210685299 1.06309983727636 1.10002537198388 1.12493091814311
1 1 1 1
1.16040261569495 1.1780869965732 1.20786780598317 1.35867955152904
1 1 1 1
1.43302370170104 1.46555486156289 1.51178116845085 1.58683345454085
1 1 1 1
1.59528080213779 1.98039989850586 2.17261167036215 2.40161776050478
1 1 1 1
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
[1] 2 2 3 5 2 5 5 4 4 1 2 1 4 2 4 3 4 5 2 4 5 2 4 1 2
table(a)
a
1 2 3 4 5
3 8 2 7 5
prop.table(table(a)) # to obtain percentages
a
1 2 3 4 5
0.12 0.32 0.08 0.28 0.20
prop.table(table(a)) *100
a
1 2 3 4 5
12 32 8 28 20
cbind(table(a), prop.table(table(a))*100)
[,1] [,2]
1 3 12
2 8 32
3 2 8
4 7 28
5 5 20
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
b
a 1 2
1 0 3
2 5 3
3 1 1
4 5 2
5 2 3
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
, , c = 3
b
a 1 2
1 0 1
2 3 1
3 0 1
4 1 0
5 1 1
, , c = 4
b
a 1 2
1 0 0
2 2 2
3 0 0
4 2 2
5 0 0
, , c = 5
b
a 1 2
1 0 2
2 0 0
3 1 0
4 2 0
5 1 2
ftable(a, c, c)
c 3 4 5
a c
1 3 1 0 0
4 0 0 0
5 0 0 2
2 3 4 0 0
4 0 4 0
5 0 0 0
3 3 1 0 0
4 0 0 0
5 0 0 1
4 3 1 0 0
4 0 4 0
5 0 0 2
5 3 2 0 0
4 0 0 0
5 0 0 3
xtabs(~a + b)
b
a 1 2
1 0 3
2 5 3
3 1 1
4 5 2
5 2 3
x <- table(a, b)
addmargins(x)
b
a 1 2 Sum
1 0 3 3
2 5 3 8
3 1 1 2
4 5 2 7
5 2 3 5
Sum 13 12 25
prop.table(table(a, b), 1) # proportions by rows
b
a 1 2
1 0.0000000 1.0000000
2 0.6250000 0.3750000
3 0.5000000 0.5000000
4 0.7142857 0.2857143
5 0.4000000 0.6000000
prop.table(table(a, b), 2)
b
a 1 2
1 0.00000000 0.25000000
2 0.38461538 0.25000000
3 0.07692308 0.08333333
4 0.38461538 0.16666667
5 0.15384615 0.25000000
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
cor(x1, x2)
[1] 0.006401211
cor.test(x1, x2)
Pearson's product-moment correlation
data: x1 and x2
t = 0.20223, df = 998, p-value = 0.8398
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.05561394 0.06836716
sample estimates:
cor
0.006401211
cor(cbind(x1, x2, y))
x1 x2 y
x1 1.000000000 0.006401211 0.99837564
x2 0.006401211 1.000000000 0.04731214
y 0.998375645 0.047312138 1.00000000
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)

plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)

cor(a, b)
[1] -0.01712362
cor(a^2, b)
[1] 0.8430192
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")

layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)

# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
[1] 170.1667
signif(mean(height), 4)
[1] 170.2
round(mean(height), 1)
[1] 170.2
round(mean(height), -2)
[1] 200
options(digits = 5)
sd(height)
[1] 8.8863
options(digits = 2)
sd(height)
[1] 8.9
options(scipen = -10)
10000000
[1] 1e+07
# sprintf
sprintf("%.3f", pi)
[1] "3.142"
sprintf("%05.1f", pi)
[1] "003.1"
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")

density(a)
Call:
density.default(x = a)
Data: a (30 obs.); Bandwidth 'bw' = 0.3891
x y
Min. :-3.4e+00 Min. :0.0e+00
1st Qu.:-1.8e+00 1st Qu.:3.0e-02
Median :-3.0e-01 Median :9.0e-02
Mean :-3.0e-01 Mean :1.6e-01
3rd Qu.: 1.2e+00 3rd Qu.:2.9e-01
Max. : 2.8e+00 Max. :4.6e-01
plot(density(a))

hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)

b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)

d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
[,1] [,2] [,3]
[1,] 2e+00 4e+00 1e+00
[2,] 6e+00 1e+00 3e+00
barplot(d, names.arg = letters[1:3])

barplot(d, beside = T)

layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))

boxplot(a)

e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)

g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)

# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
plot(x1, x2)

plot(x2~x1)

layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)

pairs(~x1 + x2 + x3 + x4)

colors()[1:10]
[1] "white" "aliceblue" "antiquewhite" "antiquewhite1" "antiquewhite2"
[6] "antiquewhite3" "antiquewhite4" "aquamarine" "aquamarine1" "aquamarine2"
length(colors())
[1] 657
colors()[600]
[1] "slategray1"
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))

c("red", "blue", "green", "orange")[z]
[1] "blue" "blue" "green" "red" "blue" "blue" "orange" "blue" "green"
[10] "red" "green" "orange" "blue" "blue" "orange" "green" "red" "blue"
[19] "blue" "green" "green" "green" "green" "green" "blue" "red" "orange"
[28] "orange" "green" "blue" "blue" "orange" "blue" "orange" "green" "orange"
[37] "red" "green" "orange" "red" "blue" "orange" "orange" "orange" "green"
[46] "blue" "orange" "orange" "red" "blue" "blue" "red" "red" "blue"
[55] "green" "blue" "red" "red" "green" "red" "blue" "green" "orange"
[64] "green" "blue" "blue" "blue" "blue" "red" "green" "blue" "blue"
[73] "green" "orange" "green" "green" "orange" "orange" "orange" "red" "blue"
[82] "green" "orange" "orange" "red" "green" "green" "red" "blue" "green"
[91] "orange" "red" "blue" "blue" "orange" "blue" "green" "red" "red"
[100] "orange"
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups

# Analysis of variance (ANOVA)
set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
aov(y~tr)
Call:
aov(formula = y ~ tr)
Terms:
tr Residuals
Sum of Squares 2.5e+02 1.2e+03
Deg. of Freedom 1.0e+00 1.2e+02
Residual standard error: 3.2e+00
Estimated effects may be unbalanced
summary(aov(y ~ factor(tr)))
Df Sum Sq Mean Sq F value Pr(>F)
factor(tr) 3.00e+00 2.97e+02 9.9e+01 9.98e+00 6.6e-06 ***
Residuals 1.16e+02 1.15e+03 9.9e+00
---
Signif. codes: 0e+00 ‘***’ 1e-03 ‘**’ 1e-02 ‘*’ 5e-02 ‘.’ 1e-01 ‘ ’ 1e+00
oneway.test(y ~ tr)
One-way analysis of means (not assuming equal variances)
data: y and tr
F = 4e+01, num df = 3e+00, denom df = 5e+01, p-value = 1e-13
oneway.test(y ~ factor(tr), var.equal = TRUE)
One-way analysis of means
data: y and factor(tr)
F = 1e+01, num df = 3e+00, denom df = 1e+02, p-value = 7e-06
by(y, tr, FUN = mean)
tr: 1
[1] 5e+00
------------------------------------------------------------------------------------
tr: 2
[1] 4.2e+00
------------------------------------------------------------------------------------
tr: 3
[1] 3.8e+00
------------------------------------------------------------------------------------
tr: 4
[1] 8.5e-01
tapply(y, tr, FUN = mean) # same thing
1 2 3 4
5.0e+00 4.2e+00 3.8e+00 8.5e-01
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0) # density
[1] 0.39894
dnorm(0, mean=-1)
[1] 0.24197
pnorm(0) # cumulative
[1] 0.5
pnorm(1.65) # 95% normal confidence interval
[1] 0.95053
pnorm(1.96)
[1] 0.975
pnorm(1.96) - pnorm(-1.96)
[1] 0.95
qnorm(c(0.025, 0.975)) # quantile
[1] -1.96 1.96
pnorm(qnorm(c(0.025, 0.975)))
[1] 0.025 0.975
# other distribution
dbinom(0, 1, 0.5)
[1] 0.5
pbinom(0, 1, 0.5)
[1] 0.5
qbinom(.95, 1, 0.5)
[1] 1
# Formulae
myformula <- ~x
class(myformula)
[1] "formula"
# interactions
y ~ x1 * x2
y ~ x1 * x2
# As strings
("y ~ x") == (y ~ x)
[1] TRUE
as.formula("y~x")
y ~ x
as.character(y ~ x)
[1] "~" "y" "x"
terms(y ~ x1 + x2)
y ~ x1 + x2
attr(,"variables")
list(y, x1, x2)
attr(,"factors")
x1 x2
y 0 0
x1 1 0
x2 0 1
attr(,"term.labels")
[1] "x1" "x2"
attr(,"order")
[1] 1 1
attr(,"intercept")
[1] 1
attr(,"response")
[1] 1
attr(,".Environment")
<environment: R_GlobalEnv>
update(y ~ x, ~. + x2)
y ~ x + x2
update(y ~ x, z ~ .)
z ~ x
# Bivariate Regression
set.seed(1)
bin <- rbinom(1000, 1, 0.5)
out <- 2 * bin + rnorm(1000)
by(out, bin, mean)
bin: 0
[1] -0.015881
---------------------------------------------------------------------
bin: 1
[1] 1.9662
t.test(out ~ bin)
Welch Two Sample t-test
data: out by bin
t = -30.3, df = 993, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.1105 -1.8537
sample estimates:
mean in group 0 mean in group 1
-0.015881 1.966237
lm(out ~ bin)
Call:
lm(formula = out ~ bin)
Coefficients:
(Intercept) bin
-0.0159 1.9821
summary(lm(out~bin))$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.015881 0.045341 -0.35027 7.2621e-01
bin 1.982119 0.065444 30.28724 1.9487e-143
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")

set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x)
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)

m1 <- glm(y ~ x, family = binomial(link = "logit"))
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit) # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit) # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
newdf[1:10,c(1,2,3,5,4)]
Error: object 'newdf' not found
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))


head(mpg)
#g <- ggplot(data = mpg, aes(x = displ, y = hwy))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE) # Turn off confidence band

ggplot(data = mpg, aes(x = class, y = hwy)) + geom_boxplot() # scatterplot

ggplot(data = mpg, aes(x = reorder(class, hwy), y = hwy)) + geom_boxplot() # reorder (mean)

ggplot(data = mpg, aes(x = reorder(class, hwy, median), y = hwy)) + geom_boxplot() # reorder by median

# jitter
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()

ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point(position = "jitter")

ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_jitter() # identical option

names(diamonds) # part of ggplot2 package
[1] "carat" "cut" "color" "clarity" "depth" "table" "price" "x" "y" "z"
ggplot(data = diamonds,aes(x =cut)) + geom_bar(aes(fill =cut)) # fill: color inside of bars

ggplot(data = diamonds, aes(x =cut)) + geom_bar(aes(color =cut)) # color: line around the bars

ggplot(data = diamonds, aes(x = color)) + geom_bar(aes(fill = cut), position = "dodge")

str(diamonds)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 53940 obs. of 10 variables:
$ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num 55 61 65 58 58 57 57 55 61 61 ...
$ price : int 326 326 327 334 335 336 336 337 337 338 ...
$ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 1)

ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 0.01)

ggplot(data = diamonds, aes(x = carat)) + geom_histogram() #stat_bin: binwidth defaulted to range/30.

zoom <- coord_cartesian(xlim = c(55, 70))
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(binwidth = 0.2) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_histogram(aes(fill = cut), binwidth = 0.1) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(fill = cut)) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(color = cut, fill = cut, alpha=0.1)) + zoom

ggplot(data = diamonds, aes(x = price)) +geom_histogram(aes(fill = cut), binwidth = 100)

ggplot(data = diamonds, aes(x = price)) + geom_density(aes(color= cut))

# Visualization of big data
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(aes(color = cut)) # not helpful

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_bin2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_density2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point() + geom_density2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(group = cut))
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(color = cut), method = "loess", se=F)

# ggsave("my-plot.pdf") # PDF more crisp
# ggsave("my-plot.png")
# ggsave("my-plot.pdf", width = 6, height = 6)
# ggsave("my-plot.png", width = 6, height = 6)
head(births, 3)
head(bnames, 3)
# Simple plot
Quentin <- bnames[bnames$name == "Letitia", ]
qplot(year, prop, data = Quentin, geom = "line")

# Interactions
michaels <- bnames[bnames$name == "Quentin" | bnames$name == "Alexis" | bnames$name == "Gina", ]
qplot(year, prop, data = michaels, geom = "line", color = interaction(sex, name))

# dplyr
library(dplyr)
bnames = tbl_df(bnames)
births = tbl_df(births)
class(bnames)
[1] "tbl_df" "tbl" "data.frame"
# filter
vivian = filter(bnames, name == "Vivian")
filter(bnames, sex == "girl" & (year == 1900 | year == 2000))
# Select columns
select(bnames, starts_with("sound"))
# Rename column
rename(iris, petal_length = Petal.Length)
# sort
arrange(bnames, desc(prop))[3,]
# year where Quentin was most popular
arrange(filter(bnames, name == "Quentin"), desc(prop))[1,]
# add columns
mutate(births, double = 2 * births)
# transmute to delete old columns
# Summarize
summarise(vivian,min = min(prop),mean = mean(prop), max = max(prop), number = n(), number_distinct = n_distinct(prop))
bnames2 = left_join(bnames, births, by = c("year","sex"))
bnames2 = mutate(bnames2, n = round(prop * births))
# Group
by_name = group_by(bnames2, name)
by_name
totals = summarise(by_name, total = sum(n))
totals
bnames2
name_sex = group_by(bnames2, name, year)
totals2 = summarise(name_sex, total = sum(n))
head(totals2)
arrange(bnames2, name)[1:3,]
ungroup(by_name)
# other example
year_sex = group_by(bnames2, year, sex)
ytotals = summarise(year_sex, births = sum(n))
ytotals
# ISLR
# Chap 6 Ridge regression / Lasso
# Ex 8
set.seed(1)
X = rnorm(100)
eps = rnorm(100)
beta0 = 3
beta1 = 2
beta2 = -3
beta3 = 0.3
Y = beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + eps
# Use regsubsets to select best model having polynomial of XX of degree 10
library(leaps)
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)
# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
[1] 3
which.min(mod.summary$bic)
[1] 3
which.max(mod.summary$adjr2)
[1] 3
# Plot cp, BIC and adjr2
plot(mod.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(3, mod.summary$cp[3], pch = 4, col = "red", lwd = 7)

plot(mod.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(3, mod.summary$bic[3], pch = 4, col = "red", lwd = 7)

plot(mod.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20,
type = "l")
points(3, mod.summary$adjr2[3], pch = 4, col = "red", lwd = 7)

# Coefs found by regression (replaces x^3 by x^7)
coefficients(mod.full, id = 3)
(Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)7
3.07627412 2.35623596 -3.16514887 0.01046843
# Now lasso
library(glmnet)
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1] # remove intercept (first column)
mod.lasso = cv.glmnet(xmat, Y, alpha = 1) # cv.glmnet: cross validation to select best lambda
best.lambda = mod.lasso$lambda.min
best.lambda
[1] 0.03991416
plot(mod.lasso)

# Next fit the model on entire data using best lambda
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
11 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 3.0398151056
poly(x, 10, raw = T)1 2.2303371338
poly(x, 10, raw = T)2 -3.1033192679
poly(x, 10, raw = T)3 .
poly(x, 10, raw = T)4 .
poly(x, 10, raw = T)5 0.0498410763
poly(x, 10, raw = T)6 .
poly(x, 10, raw = T)7 0.0008068431
poly(x, 10, raw = T)8 .
poly(x, 10, raw = T)9 .
poly(x, 10, raw = T)10 .
#Lasso also picks X^5 over X^3. It also picks X^7 with negligible coefficient.
# ISLR
# Chap 7 Non-linear Modeling - Splines, GAM
library(ISLR)
attach(Wage)
The following objects are masked from Wage (pos = 3):
age, education, health, health_ins, jobclass, logwage, maritl, race, region, wage, year
fit=lm(wage~poly(age,4,raw=T),data=Wage) # raw=T to obtain coefficients of poly directly
coef(summary(fit)) # below we see small coef (and p-value) for order 3 & 4
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
poly(age, 4, raw = T)1 2.124552e+01 5.886748e+00 3.609042 0.0003123618
poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
poly(age, 4, raw = T)3 6.810688e-03 3.065931e-03 2.221409 0.0263977518
poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
# Equivalent expressions
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
(Intercept) age I(age^2) I(age^3) I(age^4)
-1.841542e+02 2.124552e+01 -5.638593e-01 6.810688e-03 -3.203830e-05
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage) # cbind in formulas <-> I() wrapper
coef(fit2b)
(Intercept) cbind(age, age^2, age^3, age^4)age cbind(age, age^2, age^3, age^4)
-1.841542e+02 2.124552e+01 -5.638593e-01
cbind(age, age^2, age^3, age^4) cbind(age, age^2, age^3, age^4)
6.810688e-03 -3.203830e-05
# Prediction
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE) # creates list of ages where we want prediction (see below)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0)) # mar/ oma: margins of plot
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)

age.grid=seq(from=agelims[1],to=agelims[2])
list(age=age.grid) # $age list
$age
[1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
[34] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
predict(fit,list(age=c(70))) # test for 1 age - still need to create an $age list
1
106.9478
preds
$fit
1 2 3 4 5 6 7 8 9 10
51.93145 58.49674 64.57188 70.18273 75.35440 80.11122 84.47676 88.47380 92.12437 95.44973
11 12 13 14 15 16 17 18 19 20
98.47038 101.20601 103.67560 105.89731 107.88856 109.66599 111.24548 112.64214 113.87029 114.94351
21 22 23 24 25 26 27 28 29 30
115.87459 116.67557 117.35770 117.93148 118.40662 118.79210 119.09608 119.32598 119.48846 119.58939
31 32 33 34 35 36 37 38 39 40
119.63388 119.62627 119.57013 119.46827 119.32271 119.13473 118.90481 118.63269 118.31733 117.95691
41 42 43 44 45 46 47 48 49 50
117.54885 117.08980 116.57566 116.00152 115.36174 114.64988 113.85877 112.98043 112.00613 110.92638
51 52 53 54 55 56 57 58 59 60
109.73090 108.40865 106.94784 105.33588 103.55943 101.60437 99.45583 97.09815 94.51491 91.68893
61 62 63
88.60223 85.23611 81.57105
$se.fit
1 2 3 4 5 6 7 8 9 10
5.298268 4.370763 3.592101 2.955813 2.455588 2.083260 1.825676 1.662329 1.566864 1.512533
11 12 13 14 15 16 17 18 19 20
1.477572 1.447355 1.413769 1.373549 1.326690 1.275230 1.222382 1.171865 1.127324 1.091786
21 22 23 24 25 26 27 28 29 30
1.067171 1.053981 1.051263 1.056899 1.068100 1.081939 1.095809 1.107729 1.116537 1.121992
31 32 33 34 35 36 37 38 39 40
1.124827 1.126747 1.130363 1.139015 1.156439 1.186260 1.231415 1.293640 1.373218 1.469069
41 42 43 44 45 46 47 48 49 50
1.579082 1.700568 1.830715 1.966988 2.107496 2.251362 2.399124 2.553187 2.718307 2.902040
51 52 53 54 55 56 57 58 59 60
3.115018 3.370894 3.685814 4.077417 4.563601 5.161438 5.886530 6.752911 7.773327 8.959688
61 62 63
10.323512 11.876272 13.629642
$df
[1] 2995
$residual.scale
[1] 39.91479
# Use of ANOVA
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) # Analysis of variance
Analysis of Variance Table
Model 1: wage ~ age
Model 2: wage ~ poly(age, 2)
Model 3: wage ~ poly(age, 3)
Model 4: wage ~ poly(age, 4)
Model 5: wage ~ poly(age, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 2998 5022216
2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
3 2996 4777674 1 15756 9.8888 0.001679 **
4 2995 4771604 1 6070 3.8098 0.051046 .
5 2994 4770322 1 1283 0.8050 0.369682
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Or obtain these p-values by exploiting the fact that poly() creates orthogonal polynomials:
coef(summary(fit.5)) # and check P-values for coef of each order
Estimate Std. Error t value Pr(>|t|)
(Intercept) 111.70361 0.7287647 153.2780243 0.000000e+00
poly(age, 5)1 447.06785 39.9160847 11.2001930 1.491111e-28
poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
poly(age, 5)3 125.52169 39.9160847 3.1446392 1.679213e-03
poly(age, 5)4 -77.91118 39.9160847 -1.9518743 5.104623e-02
poly(age, 5)5 -35.81289 39.9160847 -0.8972045 3.696820e-01
# Generalized linear function
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial) # plus create binary response on the fly with I()
preds=predict(fit,newdata=list(age=age.grid),se=T) # default: type="link" = predictions for logit
# other option type="response" (probability vs odds)
# Step function
table(cut(age,4))
(17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
750 1399 779 72
fit=lm(wage~cut(age,4),data=Wage) # cut() returns an ordered categorical variable. Breaks= can be used
coef(summary(fit))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 94.158392 1.476069 63.789970 0.000000e+00
cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074 1.982315e-38
cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444 1.040750e-29
cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972 1.256350e-01
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
Analysis of Variance Table
Model 1: wage ~ education + age
Model 2: wage ~ education + poly(age, 2)
Model 3: wage ~ education + poly(age, 3)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 2994 3867992
2 2993 3725395 1 142597 114.6969 <2e-16 ***
3 2992 3719809 1 5587 4.4936 0.0341 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Splines
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage) # 3 knots --> 7 degrees of freedom --> 1 intercept + 6 basis functions
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")

pred=predict(fit,se=T)
plot(age,wage,col="gray")
lines(age,pred$fit,lwd=2) # too many points: that's why we changed age into a list of all unique ages

# That works too - just need to have same length of data in predict() and lines()
age.grid2=sort(unique(age))
pred2=predict(fit,newdata=list(age=age.grid2),se=T)
plot(age,wage,col="gray")
lines(age.grid2,pred2$fit,lwd=2)

str(pred2)
List of 4
$ fit : Named num [1:61] 60.5 62.7 65.8 69.6 73.8 ...
..- attr(*, "names")= chr [1:61] "1" "2" "3" "4" ...
$ se.fit : Named num [1:61] 9.46 5.63 3.78 3.33 3.22 ...
..- attr(*, "names")= chr [1:61] "1" "2" "3" "4" ...
$ df : int 2993
$ residual.scale: num 39.9
dim(bs(age,knots=c(25,40,60), degree = 3)) # bs generates matrix with 6 basis functions (see below)
[1] 3000 6
dim(bs(age,df=6)) # df option to produce a spline with knots at uniform quantiles of the data
[1] 3000 6
attr(bs(age,df=6),"knots")
25% 50% 75%
33.75 42.00 51.00
# Natural spline: ns()
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=data.frame(age=age.grid),se=T)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
lines(age.grid, pred2$fit,col="red",lwd=2)

# Smooth spline
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16) # forces 16 degrees of freedom
fit2=smooth.spline(age,wage,cv=TRUE) # choose value of lambda by cross validation --> 6.8 degrees of freedom
cross-validation with non-unique 'x' values seems doubtful
fit2$df
[1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

# Local regresssion (Loess)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage) # span=0.2 means use 20% of observations
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

---
title: "R Notebook"
output:
  html_document: default
  html_notebook: default
---

```{r}
A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
```

```{r}
A
B
```

```{r}
View(anscombe)
lm(y3~x3, data = anscombe)
```

```{r}
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  ## or   ff[[2]] <- as.name(paste0("y", i))
  ##      ff[[3]] <- as.name(paste0("x", i))
  mods[[i]] <- lmi <- lm(ff, data = anscombe)
  print(anova(lmi))
}
```

```{r}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
```

```{r}
# Aggregate function
#Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")

# or from Titanic Kagg
#aggregate(Survived ~ Fare2 + Pclass + Sex, data=train, FUN=function(x) {sum(x)/length(x)})
```

```{r}
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma =  c(0, 0, 2, 0))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
       xlim = c(3, 19), ylim = c(3, 13))
  abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
```

```{r}
x = c(TRUE, FALSE, FALSE)
typeof(x)  #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
```

```{r}
dim(y)
length(y)
```

```{r}
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)

```

```{r}
quote(x+y)
as.list(quote(x + y))
```

```{r}
1e3L #create constant
1L
```

```{r}
cat(1+2)
'+'(1, 2)
```

```{r}
x <- options()
x$prompt
```

```{r}
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
```

```{r}
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
```

```{r}
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
```

```{r}
i <- matrix(1:4, 2, byrow = TRUE)
i
```

```{r}
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
```

```{r}
# https://cran.r-project.org/doc/manuals/R-intro.pdf

help.start()
```

```{r}
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
```

```{r}
ls()
```

```{r}
rm(list=ls())
ls()
```

```{r}
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy 
```

```{r}
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
```

```{r}
levels(statef)
```

```{r}
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
```

```{r}
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
```

```{r}
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
```

```{r}
help("<-")
```

```{r}
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
```

```{r}
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
```

```{r}
seq(0,10, length=11) == seq(0,10, by=1)
```

```{r}
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
```

```{r}
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
```

```{r}
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
```

```{r}
table(x)
```

```{r}
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
```

```{r}
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
```

```{r}
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
```

```{r}
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
```

```{r}
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
```

```{r}
summary(df)
```

```{r}
names(df);cat(rownames(df))
```

```{r}
# Merge
merge(x, y)                 # INNER JOIN
merge(x, y, all.x = TRUE)   # LEFT JOIN
merge(x, y, all.y = TRUE)   # RIGHT JOIN
merge(x, y, all   = TRUE)   # OUTER JOIN
#merge(a, b, by=c("y", "z")) # specify what to merge on
```

```{r}
# Regression
data(cars) 
#View(cars)

# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)

#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
```

```{r}
# Trees
# install.packages('rattle')
# install.packages('rpart.plot')
# install.packages('RColorBrewer')
library(rattle)
library(rpart.plot)
library(RColorBrewer)
```

```{r}
# http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/
# fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
#                data=train,
#                method="class", 
#                control=rpart.control(minsplit=2, cp=0))
# fancyRpartPlot(fit)

# Predict a continuous variable (method anova)
# Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
#                   data=combi[!is.na(combi$Age),], 
#                   method="anova")
```

```{r}
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
```

```{r}
# Delete
h[["bar"]] <- NULL
```

```{r}
m <- matrix( c(1,2,3,4), nrow=2 )
m
```

```{r}
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
```

```{r}
?solve
```

```{r}
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
```

```{r}
r
```

```{r}
ftable(d) #easier reading
```

```{r}
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
```

```{r}
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
```

```{r}
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping

by(1:20, gl(2,10,20), sum)
```

```{r}
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list

```

```{r}
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)


```

```{r}
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
        main="Accumulated return")
```

```{r}
# Strings
print("Hello\n")

cat("Hello\n") #use cat
```

```{r}
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
```

```{r}
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
```

```{r}
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
```

```{r}
paste(1:3, "Hello World!", sep=":")
```

```{r}
nchar("Hello World!")
```

```{r}
s <- "Hello World"
substring(s, 4, 6)
```

```{r}
s <- "foo-->bar-->baz"
strsplit(s, "-->")
```

```{r}
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
```

```{r}
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
```

```{r}
grep("O", s)
grep("O", s, value=T)
```

```{r}
regexpr("o", "Hello")
```

```{r}
regexpr("o", c("Hello", "World!"))
```

```{r}
s <- "foo    bar baz"
gsub(" ", "", s)   # Remove all the spaces
gsub(" +", " ", s)  # Remove multiple spaces and replace them by single spaces

```

```{r}
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
```

```{r}
# Dates
as.Date("2005-05-15") #ISO 8601
```

```{r}
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")

```

```{r}
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
```

```{r}
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
```

```{r}
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
```

```{r}
methods(class="Date")
```

```{r}
as.POSIXlt("2005-05-15 21:45:17")
```

```{r}
as.POSIXct(Sys.Date())
```

```{r}
# Reading from dataframes
# option 1
  #d <- read.table("foo.txt")
  #d$Date <- as.Date( as.character( d$Date ) )
# option 2
  #read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
```

```{r}

```

```{r}
options(warn=1)
```

```{r}
methods(plot)
```

```{r}
# Import 

# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv")  # semicolon-separated file, with a
#                            # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf")   # Fixed width fields
```

```{r}
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try

# d <- read.table("foo.csv", header = TRUE, sep = ",", 
#                 na.strings = c("#N/A!", "NA", "@NA"), 
#                 quote = '"',
#                 comment.char = "")
```

```{r}
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
  # x <- scan("foo.txt", sep=",")  # Gives a numeric vector
  # n <- scan("foo.txt", sep=",", nlines=1)
  # x <- matrix(x, nc=n)

# A vector of strings
  #x <- scan("foo.txt", what=character(0))

```

```{r}
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression

  # fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
  # fm6 <- update(fm05, . ~ . + x6)
  # smf6 <- update(fm6, sqrt(.) ~ .)
```

```{r}
# Spine (from help)
require(graphics)

op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
```

```{r}
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
```

```{r}
attributes(g2)$na.action
```

```{r}
sum(g1)
sum(g1, na.rm = TRUE)
```

```{r}
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
```

```{r}
cor(m)  # returns all NAs
```

```{r}
cor(m, use = "complete.obs") # Default, all records with NA removed
```

```{r}
cor(m, use = "pairwise.complete.obs") #kept more records for y~x and y~z
```

```{r}
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)
```

```{r}
# Checking length of data used for regression
length(m$y)
length(lm$fitted)
```

```{r}
# checking where missing data is
is.na(m) # only useful for small examples
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable", 
    xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)
```

```{r}
# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
```

```{r}
# Mean imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
x2
```

```{r}
# Random imputation - conserve mean and variance. 
# How: sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
x3
```

```{r}
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html 
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
```

```{r}
save(mydf, file = "saveddf.RData") # can be loaded by double-clicking on saved file
```

```{r}
unlink("saveddf.RData") # removing file
```

```{r}
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
```

```{r}
dput(mydf, "saveddf.txt")
```

```{r}
mydf2 <- dget("saveddf.txt")
mydf2
```

```{r}
mydf==mydf2 # due to rounding
```

```{r}
unlink("saveddf.text")
```

```{r}
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
```

```{r}
# Dataframe rearrangement

set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20), 
    d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
```

```{r}
# manual order change

head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
```

```{r}
# using order
order(mydf$e)
head(mydf[order(mydf$e), ]) # ordering on any column
```

```{r}
# Subset

mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]

subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
subset(mydf, a == 1 & b > 2, select = c("c", "d")) # filter rows & columns

```

```{r}
# Splitting 

split(mydf, mydf$a) # -> splitting according to values of a
```

```{r}
split(mydf, list(mydf$a, mydf$b))
```

```{r}
lapply(split(mydf, mydf$a), summary) # perform summary on each of the dataframes
```

```{r}
# sampling: splitting into training and test set
# Option 1
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
```

```{r}
# use 5 rows as training set
mydf[s,]
```

```{r}
# test set
mydf[-s, ]
```

```{r}
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2) #won't necessarily give exactly 5 rows
s2
```

```{r}
mydf[s2,]
mydf[-s2,]
```

```{r}
# Recoding vectors
library(car)
```

```{r}
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
```

```{r}
e <- recode(h, "NA=99")
e
```

```{r}
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
```

```{r}
interaction(i$Var1, i$Var2) # creates all possible combinations
```

```{r}
# Scaling
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n, 
    1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
```

```{r}
str(mydf)
```

```{r}
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
with(mydf, x1+x2-x3) # with to indicate dataframe

```

```{r}
rowSums(mydf)
rowSums(mydf, na.rm = T)
data.frame(1:n, rowSums(mydf, na.rm = T))
```

```{r}
rowMeans(mydf)
```

```{r}
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2)  rows & columns.
apply(mydf, 2, var)
sapply(mydf, var) # over list or vector
```

```{r}
# adding a variable
newvar <- numeric(nrow(mydf))
newvar[mydf$x1 == 1] <- with(mydf[mydf$x1 == 1, ], x2 + x3)
newvar[mydf$x1 == 0] <- with(mydf[mydf$x1 == 0, ], x3 + x4 + x5)
newvar
```

```{r}
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3) # here different lengths !
```

```{r}
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
quantile(a, seq(0, 1, by = 0.1))
```

```{r}
summary(as.logical(rbinom(1000, 1, 0.5)))
```

```{r}
summary(factor(a)) # for factor, returns all value
```

```{r}
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
```

```{r}
table(a)
```

```{r}
prop.table(table(a)) # to obtain percentages
prop.table(table(a)) *100
```

```{r}
cbind(table(a), prop.table(table(a))*100)
```

```{r}
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
```

```{r}
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
```

```{r}
ftable(a, c, c) # provides more compact format
```

```{r}
xtabs(~a + b) # right hand formulas same as table
```

```{r}
x <- table(a, b)
addmargins(x)
```

```{r}
prop.table(table(a, b), 1) # proportions by rows
prop.table(table(a, b), 2)
```

```{r}
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
```

```{r}
cor(x1, x2)
cor.test(x1, x2)
```

```{r}
cor(cbind(x1, x2, y)) # input is matrix for correlation matrix
```

```{r}
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)
```

```{r}
plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)
```

```{r}
cor(a, b)
cor(a^2, b)
```

```{r}
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")
```

```{r}
layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)
```

```{r}
# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
```

```{r}
signif(mean(height), 4)
round(mean(height), 1)
round(mean(height), -2)
```

```{r}
options(digits = 5)
sd(height)
options(digits = 2)
sd(height)
```

```{r}
options(scipen = -10) #positive value to get fixed notation
10000000
```

```{r}
# sprintf
sprintf("%.3f", pi)
sprintf("%05.1f", pi)
```

```{r}
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")
```

```{r}
density(a)
plot(density(a))
```

```{r}
hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)
```

```{r}
b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)
```

```{r}
d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
barplot(d, names.arg = letters[1:3])
```

```{r}
barplot(d, beside = T)
```

```{r}
layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))
```

```{r}
boxplot(a)
```

```{r}
e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)
```

```{r}
g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)
```

```{r}
# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
```

```{r}
plot(x1, x2)
plot(x2~x1)
```

```{r}
layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)
```

```{r}
pairs(~x1 + x2 + x3 + x4)
```

```{r}
colors()[1:10]
length(colors())
colors()[600]
```

```{r}
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))
```

```{r}
c("red", "blue", "green", "orange")[z]
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups
```

```{r}
# Analysis of variance 

set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
```

```{r}
aov(y~tr)
```

```{r}
summary(aov(y ~ factor(tr)))
```

```{r}
oneway.test(y ~ tr)
```

```{r}
oneway.test(y ~ factor(tr), var.equal = TRUE)
```

```{r}
by(y, tr, FUN = mean)
```

```{r}
tapply(y, tr, FUN = mean) # same thing
```

```{r}
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0)  # density
dnorm(0, mean=-1)
```

```{r}
pnorm(0)  # cumulative
pnorm(1.65) # 95% normal confidence interval
pnorm(1.96)
pnorm(1.96) - pnorm(-1.96)
```

```{r}
qnorm(c(0.025, 0.975))  # quantile
pnorm(qnorm(c(0.025, 0.975)))
```

```{r}
# other distribution
dbinom(0, 1, 0.5)
pbinom(0, 1, 0.5)
qbinom(.95, 1, 0.5) # because binomial discrete distribution
```

```{r}
# Formulae
myformula <- ~x
class(myformula)
```

```{r}
# interactions
y ~ x1 * x2
y ~ x1:x2 # without the variables themselves 
y ~ -1 + x1 * x2 # drop the intercept
y ~ x + I(x^2) # without I() R thinks x^2 is a duplicate
```

```{r}
# As strings
("y ~ x") == (y ~ x)
as.formula("y~x")
as.character(y ~ x) # Formula indexed with operand first
```

```{r}
terms(y ~ x1 + x2)
```

```{r}
update(y ~ x, ~. + x2)
update(y ~ x, z ~ .)
```

```{r}
# Bivariate Regression

set.seed(1)
bin <- rbinom(1000, 1, 0.5)

out <- 2 * bin + rnorm(1000)

by(out, bin, mean)
```

```{r}
t.test(out ~ bin)
```

```{r}
lm(out ~ bin) # slope = mean difference
```

```{r}
summary(lm(out~bin))$coef
```

```{r}
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")
```

```{r}
set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
```

```{r}
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x) # more outcomes of 1 as x -> 1
```

```{r}
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color (for points)
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)
# here linear doesn't work
```

```{r}
m1 <- glm(y ~ x, family = binomial(link = "logit"))
```

```{r}
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
```

```{r}
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
```

```{r}
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit)  # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit)  # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
```

```{r}
newdf[1:10,c(1,2,3,5,4)]
```

```{r}
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))
```

```{r}
library(ggplot2)
# http://ggplot2.tidyverse.org/reference/

# basic R's plot
plot(iris$Sepal.Width, iris$Sepal.Length)
```

```{r}
head(mpg)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class)) # adding a dimension
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl) # 2D grid, rows ~ cols
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .) 
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)
# facet_wrap wraps a 1d sequence of panels into 2d. This is generally a better use of screen space than facet_grid because most displays are roughly rectangular.
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) +  geom_point() + geom_smooth()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE)  # Turn off confidence band
```

```{r}
ggplot(data = mpg, aes(x = class, y = hwy)) + geom_boxplot() # scatterplot
ggplot(data = mpg, aes(x = reorder(class, hwy), y = hwy)) + geom_boxplot() # reorder (mean)
ggplot(data = mpg, aes(x = reorder(class, hwy, median), y = hwy)) + geom_boxplot() # reorder by median
```
```{r}
# jitter
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point(position = "jitter")
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_jitter() # identical option
```
```{r}
names(diamonds) # part of ggplot2 package
ggplot(data = diamonds,aes(x =cut)) + geom_bar(aes(fill =cut)) # fill: color inside of bars
ggplot(data = diamonds, aes(x =cut)) + geom_bar(aes(color =cut)) # color: line around the bars
ggplot(data = diamonds, aes(x = color)) + geom_bar(aes(fill = cut), position = "dodge")
```



```{r}
str(diamonds)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 1)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 0.01)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram() #stat_bin: binwidth defaulted to range/30.
```

```{r}
zoom <- coord_cartesian(xlim = c(55, 70))
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(binwidth = 0.2) + zoom
```

```{r}
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(aes(fill = cut), binwidth = 0.1) + zoom
```

```{r}
# to compare variables
ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(fill = cut)) + zoom
ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(color = cut, fill = cut, alpha=0.1)) + zoom
```

```{r}
ggplot(data = diamonds, aes(x = price)) +geom_histogram(aes(fill = cut), binwidth = 100)
ggplot(data = diamonds, aes(x = price)) + geom_density(aes(color= cut)) # better
```

```{r}
# Visualization of big data
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(aes(color = cut)) # not helpful
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_bin2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_density2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point() + geom_density2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(group = cut))
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(color = cut), method = "loess", se=F)
```

```{r}
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(size = 0.5, alpha = 0.1)
```

```{r}
# ggsave("my-plot.pdf") # PDF more crisp
# ggsave("my-plot.png")
# ggsave("my-plot.pdf", width = 6, height = 6)
# ggsave("my-plot.png", width = 6, height = 6)
```

```{r}
# More plot examples
bnames = read.csv("bnames.csv", stringsAsFactors = FALSE) 
births = read.csv("births.csv", stringsAsFactors = FALSE)
head(births, 3)
head(bnames, 3)
```

```{r}
# Simple plot
Quentin <- bnames[bnames$name == "Letitia", ] 
qplot(year, prop, data = Quentin, geom = "line")
```

```{r}
# Interactions
michaels <- bnames[bnames$name == "Quentin" | bnames$name == "Alexis" | bnames$name == "Gina", ]
qplot(year, prop, data = michaels, geom = "line", color = interaction(sex, name))
```

```{r}
# dplyr
library(dplyr)
bnames = tbl_df(bnames) 
births = tbl_df(births) 
class(bnames)
```

```{r}
# filter
filter(bnames, sex == "girl" & (year == 1900 | year == 2000))
```

```{r}
# Select columns
select(bnames, starts_with("sound"))
# Rename column
rename(iris, petal_length = Petal.Length)
```

```{r}
# sort
arrange(bnames, desc(prop))[3,]
# year where Quentin was most popular
arrange(filter(bnames, name == "Quentin"), desc(prop))[1,]
```

```{r}
# add columns
mutate(births, double = 2 * births)
# transmute to delete old columns
```

```{r}
# Summarize
summarise(vivian,min = min(prop),mean = mean(prop), max = max(prop), number = n(), number_distinct = n_distinct(prop))
```

```{r}

bnames2 = left_join(bnames, births, by = c("year","sex"))
bnames2 = mutate(bnames2, n = round(prop * births))
# Group
by_name = group_by(bnames2, name)
by_name
totals = summarise(by_name, total = sum(n)) 
totals
```

```{r}
bnames2
name_sex = group_by(bnames2, name, year) 
totals2 = summarise(name_sex, total = sum(n)) 
head(totals2)
```

```{r}
arrange(bnames2, name)[1:3,]
```

```{r}
ungroup(by_name)
```

```{r}
# other example
year_sex = group_by(bnames2, year, sex) 
ytotals = summarise(year_sex, births = sum(n)) 
ytotals
```

```{r}
bnames2
```

```{r}
# ISLR 
# Chap 6 Ridge regression / Lasso
# Ex 8
set.seed(1)
X = rnorm(100)
eps = rnorm(100)

beta0 = 3
beta1 = 2
beta2 = -3
beta3 = 0.3
Y = beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + eps

# Use regsubsets to select best model having polynomial of XX of degree 10
library(leaps)
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)

# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
which.min(mod.summary$bic)
which.max(mod.summary$adjr2)

# Plot cp, BIC and adjr2
plot(mod.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(3, mod.summary$cp[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(3, mod.summary$bic[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, 
    type = "l")
points(3, mod.summary$adjr2[3], pch = 4, col = "red", lwd = 7)

# Coefs found by regression (replaces x^3 by x^7)
coefficients(mod.full, id = 3)

```

```{r}
# Now lasso
library(glmnet)
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1] # remove intercept (first column)
mod.lasso = cv.glmnet(xmat, Y, alpha = 1) # cv.glmnet: cross validation to select best lambda
best.lambda = mod.lasso$lambda.min
best.lambda
plot(mod.lasso)

# Next fit the model on entire data using best lambda
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
#Lasso also picks X^5 over X^3. It also picks X^7 with negligible coefficient.

```

```{r}
# ISLR 
# Chap 7 Non-linear Modeling - Splines, GAM
library(ISLR)
attach(Wage)
```

```{r}
fit=lm(wage~poly(age,4,raw=T),data=Wage)  # raw=T to obtain coefficients of poly directly
coef(summary(fit)) # below we see small coef (and p-value) for order 3 & 4

# Equivalent expressions
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage) # cbind in formulas <-> I() wrapper
coef(fit2b)
```

```{r}
# Prediction
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE) # creates list of ages where we want prediction (see below)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0)) # mar/ oma: margins of plot
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
```

```{r}
age.grid=seq(from=agelims[1],to=agelims[2]) # Or: age.grid2=sort(unique(age))
list(age=age.grid) # $age list

predict(fit,list(age=c(70))) # test for 1 age - still need to create an $age list
```

```{r}
preds
```

```{r}
# Use of ANOVA = Analysis of variance. To determined which degree is needed
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) 
# Or obtain these p-values by exploiting the fact that poly() creates orthogonal polynomials:
coef(summary(fit.5)) # and check P-values. Each p-value corresponds to comparison from model w previous one
```

```{r}
# Generalized linear function
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial) # plus create binary response on the fly with I()
preds=predict(fit,newdata=list(age=age.grid),se=T) # default: type="link" = predictions for logit
# other option type="response" (probability vs odds)
```

```{r}
# Step function
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage) # cut() returns an ordered categorical variable. Breaks= can be used 
coef(summary(fit))

```

```{r}
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3) # Anova can compare not orthogonal polynomials, and that have other terms
```

```{r}
# Splines
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage) # 3 knots --> 7 degrees of freedom --> 1 intercept + 6 basis functions
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
```

```{r}
pred=predict(fit,se=T)
plot(age,wage,col="gray")
lines(age,pred$fit,lwd=2) # too many points: that's why we changed age into a list of all unique ages
```

```{r}
# That works too - just need to have same length of data for x and y in lines(x, y...)
# = bewtween age.grid2 and pred2$fit
age.grid2=sort(unique(age))
pred2=predict(fit,newdata=list(age=age.grid2),se=T)
plot(age,wage,col="gray")
lines(age.grid2,pred2$fit,lwd=2)
```

```{r}
str(pred2)
```

```{r}
dim(bs(age,knots=c(25,40,60), degree = 3)) # bs generates matrix with 6 basis functions (see below)
dim(bs(age,df=6)) # df option to produce a spline with knots at uniform quantiles of the data
attr(bs(age,df=6),"knots")
```

```{r}
# Natural spline: ns()
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T) #list or data.frame for predit(<class lm>, ..)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
lines(age.grid, pred2$fit,col="red",lwd=2)
```

```{r}
# Smooth spline
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16) # forces 16 degrees of freedom
fit2=smooth.spline(age,wage,cv=TRUE) # choose value of lambda by cross validation --> 6.8 degrees of freedom
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
```

```{r}
# Local regresssion (Loess)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage) # span=0.2 means use 20% of observations
fit2=loess(wage~age,span=.5,data=Wage) # 50% of observations --> smoother
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2) # predict needs data.frame for loess fit
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

